# Ryan Copus, Ryan Hübert and Paige Pellaton
# "Trading Diversity? Judicial Diversity and Case Outcomes in Federal Courts"
# American Political Science Review

# File name: chp_apsr_01_plaintiffs1.py
# Last revision date: May 30, 2024
# Questions or comments? Contact Ryan Hübert: https://ryanhubert.com/

# What does this script do?
# This script identifies which plaintiffs are humans and extracts their names.

# Last pre-print execution of this code:
# > Date: March 24, 2024
# > Machine: MacBook Pro 14" (2021 model) with Apple M1 Max chip and 64 GB RAM
# > OS: macOS Sonoma 14.4
# > Python: version 3.10

################################################################################
# IMPORTANT NOTE ABOUT DATA LIMITATIONS
################################################################################

# In the publicly available version of the replication files, all plaintiff
# names have been redacted. As a result, execution of this script will not be
# successful. Please contact the author(s) for more information.

################################################################################
# Before you run this script...
################################################################################

# You need to download the Stanford NER data and save it in the `data`
# subdirectory in order for this script to run:
# https://nlp.stanford.edu/software/stanford-ner-4.2.0.zip

# Before you run this script, you must start the NER server in a terminal window.
# First `cd` to the directory containing the Replication directory, then run:
# > cd /data/stanford-ner-2020-11-17
# > java -Djava.ext.dirs=./lib -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer -port 9199 -loadClassifier ./classifiers/english.all.3class.distsim.crf.ser.gz
# See: https://stackoverflow.com/questions/33748554/how-to-speed-up-ne-recognition-with-stanford-ner-with-python-nltk

################################################################################
# Load packages and set options
################################################################################

import re
import pandas as pd
from sner import Ner
import os
from multiprocessing import Pool, cpu_count

################################################################################
# Define functions we will use below
################################################################################

# Define a function that tags human names using Stanford NER.
def TagFunction(n):
    """
    Send one string to the Stanford NER server and return its tagged form.

    A client is opened against localhost:9199 for every call. Each item the
    server returns is read as (text, tag) and rendered as "<text {tag}>";
    the rendered pieces are concatenated with no separator.

    Returns a one-row dataframe with the columns "string" (the input `n`)
    and "tagged_string" (the rendered tags).
    """
    tagger = Ner(host='localhost', port=9199)
    pieces = []
    for entity in tagger.get_entities(n):
        pieces.append("<" + entity[0].strip() + " {" + entity[1] + "}>")
    tagged = "".join(pieces)
    return pd.DataFrame({"string": [n], "tagged_string": [tagged]})

# Define a function that splits human names after tagging them.
def SplitTagged(row):
    """
    Extract the person names found in one NER-tagged string.

    `row` must support `row["string"]` (the untagged name) and
    `row["tagged_string"]` (a concatenation of "<text {TAG}>" pieces).
    Consecutive pieces tagged PERSON are joined with spaces into one name;
    any other tag ends the current name. Fragments that are empty or consist
    only of a suffix or title (JR, SR, a run of I/V characters, MR, MRS, MS,
    DR, each with an optional trailing period) are discarded.

    Returns a dataframe with one row per extracted name and the columns
    "name" (the untagged string), "count" (how many names were extracted
    from it) and "name_tagged" (the extracted name). It has zero rows when
    no person name is found. When the tagged string cannot be parsed (for
    example, it is missing/NaN), a single row is returned with "count" and
    "name_tagged" set to pd.NA so the caller can identify and drop it.
    """
    try:
        out = re.findall(r"<([^>]+)>", row["tagged_string"])
        # Each piece looks like "text {TAG}"; [:-1] drops the closing brace.
        out = [(x.split(" {")[0], x.split(" {")[1][:-1]) for x in out]
    except Exception:
        # Deliberately best-effort, but no longer a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate out of pool workers.
        return pd.DataFrame({"name": [row["string"]],
                             "count": [pd.NA],
                             "name_tagged": [pd.NA]})

    # Replace every non-PERSON piece by a sentinel, then split on the sentinel
    # so that each run of adjacent PERSON pieces becomes one candidate name.
    names = [x[0].strip() if x[1] == "PERSON" else "<...>" for x in out]
    suffix_only = r"^(JR\.?|SR\.?|[IV]+|MRS?\.?|MS\.?|DR\.?)?$"
    names = [x.strip() for x in " ".join(names).split("<...>")
             if not re.search(suffix_only, x.strip().upper())]
    return pd.DataFrame({"name": [row["string"]] * len(names),
                         "count": [len(names)] * len(names),
                         "name_tagged": names})

# Define a function that removes non-human names.
def CullFunction(tok_frame):
    """
    Flag the names that do NOT contain a given non-human token.

    `tok_frame` is a pair: element 0 is a regular-expression pattern and
    element 1 is a pandas Series of name strings. The result is a boolean
    Series that is True wherever the pattern is not found in the name.
    """
    pattern = tok_frame[0]
    candidates = tok_frame[1]
    has_token = candidates.str.contains(pattern, regex=True)
    return ~has_token

# Define a function that splits first, middle and last names.
def SplitFirstLast(obj):
    """
    Split one tagged name into first, middle and last name.

    `obj` is a pair (index, name_tagged). The name is split on single
    spaces, then:
      * if the second-to-last word is a surname particle (VAN, VON, ST, ST.,
        DE, LA, DER, DEL, DELLA; case-insensitive), the last two words are
        merged into one last name;
      * if no word contains at least two consecutive ASCII letters, all
        three parts are blank;
      * one word is a last name; two words are first + last; three words are
        first + middle + last;
      * with four or more words the last word is the last name and the first
        word is the first name, unless the name starts with an initial
        followed by a capitalised word, in which case the first two words
        form the first name; everything in between is the middle name.
    Finally, any part without two consecutive ASCII letters (for example a
    bare initial) is blanked.

    Returns a one-row dataframe with the columns "index" (obj[0], passed
    through so the caller can restore the original order), "fn", "mn", "ln".
    """
    # A "real" name word has two or more consecutive letters. The original
    # class [A-z] also matched the characters between "Z" and "a"
    # ([ \ ] ^ _ `), so junk tokens such as "__" were accepted as names.
    name_word = r"[A-Za-z]{2,}"

    name_tagged = obj[1]
    tmp = name_tagged.split(" ")
    ## Combine last names that have multiple words
    if len(tmp) >= 2 and tmp[-2].upper() in ["VAN", "VON", "ST", "ST.", "DE", "LA", "DER", "DEL", "DELLA"]:
        tmp = tmp[:-2] + [" ".join(tmp[-2:])]
    ## Get rid of strings with just initials
    if not any(re.search(name_word, x) for x in tmp):
        tmp = ["", "", ""]
    elif len(tmp) == 1:
        tmp = ["", "", tmp[0]]
    elif len(tmp) == 2:
        tmp = [tmp[0], "", tmp[1]]
    elif len(tmp) == 3:
        tmp = [tmp[0], tmp[1], tmp[2]]
    elif re.search(r"^[A-Z]\.? [A-Z]([a-z]| )", name_tagged):
        # Leading initial, e.g. "J. Edgar ...": keep it with the next word.
        tmp = [" ".join(tmp[0:2]), " ".join(tmp[2:-1]), tmp[-1]]
    else:
        tmp = [tmp[0], " ".join(tmp[1:-1]), tmp[-1]]

    # Blank out any part that is only an initial or punctuation
    tmp = [x if re.search(name_word, x) else "" for x in tmp]

    return pd.DataFrame({"index": [obj[0]], "fn": [tmp[0]], "mn": [tmp[1]], "ln": [tmp[2]]})

################################################################################
# In the main workspace, use multiple cores to do the tagging
################################################################################

if __name__ == "__main__":
    # Pipeline: load the plaintiff names, drop anonymized and apparently
    # non-human strings, tag the rest with the Stanford NER server, split the
    # tagged names into first/middle/last, and write one row per plaintiff
    # with a `human` indicator.
    ncores = cpu_count()
    redo_tagging = input("Would you like to redo the Stanford NER string tagging? (y/n) ") == "y"
    print("Now cleaning the plaintiff names using " + str(ncores) + " cores of your machine...")

    # Fail fast (with a hint) if the NER server is not reachable
    try:
        TagFunction("Joe Biden")
    except Exception:
        print("\n" + (80*"+") +
              "\nError! Did you start the NER server before you ran this script?\n"
              "See the instructions at the top of the .py file.\n" + (80*"+") +
              "\n\nNow printing the python exception for your inspection:\n")
        raise

    # Get current working directory
    # NOTE(review): this removes every occurrence of "code"/"Code" from the
    # path, not only a trailing "code" directory -- confirm the script is only
    # run from locations where that is harmless.
    wdir = re.sub("[Cc]ode","",os.getcwd())

    # Create an output directory
    odir = wdir + "/outs"
    os.makedirs(odir, exist_ok=True)

    # Load the names
    data_path = wdir + "/data/chp_apsr_plaintiff_data.csv"
    pf = pd.read_csv(data_path, dtype = "object")

    # Create a dataframe where each row is a unique name string.
    # rename_axis/reset_index(name=...) yields the columns "name" and "count"
    # on both pandas 1.x and 2.x; the labels produced by a plain
    # value_counts().reset_index() changed between those versions.
    pf = pf.loc[:, "name"].value_counts().rename_axis("name").reset_index(name="count")

    # Save the original name from the pf dataframe
    pf["name_orig"] = pf["name"]

    # Some names are anonymized
    # NOTE(review): [^A-z] also treats the six ASCII characters between "Z"
    # and "a" as letters; [^A-Z] may have been intended. Left unchanged
    # because it affects which names are dropped -- confirm before changing.
    pf = pf.loc[(~pf["name"].str.upper().str.strip().str.contains(r"(?:[^A-z]|^)(?:JOHN [RD]OE(?:\(?S\)?)?|JANE [RD]OE(?:\(?S\)?)?|[RD]OE(?:\(?S\)?)?)(?:[^A-z]|$)",regex=True)),:]
    pf = pf.loc[(~pf["name"].str.upper().str.strip().str.contains("^UNKNOWN$",regex=True)),:]

    # Clean up bad spacing and quotes
    pf["name"] = pf["name"].str.replace('"'," ", regex = True).str.strip()
    pf["name"] = pf["name"].str.replace(r"\s+"," ", regex = True).str.strip()

    # pf = pf.loc[0:100,:]

    # Use a custom dictionary to identify strings we do not think are human names
    # This list was compiled by Paige Pellaton
    toks_path = wdir + "/data/litigant_tokens_py.txt"
    with open(toks_path, "r") as tok_file:
        toks = tok_file.read().split("\n")
    # A blank line (e.g. a trailing newline) would become an empty regex,
    # which matches every name and would remove all of them.
    toks = [x for x in toks if x]
    if not toks:
        raise ValueError("No tokens found in " + toks_path)

    print(" > Removing plaintiff names with apparently non-human tokens (pass 1)...")
    rmvd = len(pf)
    with Pool(processes=ncores) as pool:
        test = pool.map(CullFunction, [(x,pf["name"]) for x in toks])
    # One boolean column per token; keep a name only if it passes every token
    test = pd.concat(test, axis=1)
    pf = pf.loc[(test.eq(True).all(axis=1)),:]
    rmvd = rmvd - len(pf)
    print("   >> Removed " + str(rmvd) + " plaintiff names...")
    del test

    # Remove strings with "&", "d/b/a", "dba" (any case) or a lowercase "and"
    # as a separate word -- most likely a business
    pf = pf.loc[(~pf["name"].str.upper().str.contains(r"(?: |^)(?:\&|D/B/A|DBA)(?: |$)")),:]
    pf = pf.loc[(~pf["name"].str.contains("(?: |^)(?:and)(?: |$)")),:]

    # Reset the index
    pf = pf.reset_index(drop=True)

    # Use the Stanford NER to tag plaintiffs who have apparently human names
    fname = odir + "/chp_apsr_tagged_tokens.csv"
    if (not os.path.exists(fname)) or redo_tagging:
        print(" > Using Stanford NER to tag human names...")
        n_names = len(pf)
        tagged = []
        for index, name in enumerate(pf["name"]):
            print("   >> Tagging plaintiff " + str(index+1) + " of " + str(n_names) + "...")
            tagged.append(TagFunction(name))
        # Concatenate once at the end instead of growing a dataframe per name
        tf = pd.concat(tagged, axis=0) if tagged else pd.DataFrame()
        tf.to_csv(fname, index=False, header=True)

    print(" > Cleaning up tagged results from Stanford NER...")
    nf = pd.read_csv(fname, dtype="object")
    # The tagged file may come from an earlier run: make sure it covers
    # exactly the names we have now.
    if (~nf["string"].isin(pf["name"])).any() or (~pf["name"].isin(nf["string"])).any():
        raise RuntimeError("The tagged names in " + fname + " do not match the current "
                           "plaintiff names. Rerun this script and answer 'y' to redo the tagging.")
    with Pool(processes=ncores) as pool:
        nf = pool.map(SplitTagged, [row for index, row in nf.iterrows()])
    nf = pd.concat(nf, axis = 0)
    print("   >> Deleted " + str(len(nf.loc[nf["count"].isnull(),:])) + " tagged plaintiff name(s) because they were blank... ")
    nf = nf.loc[(~nf["count"].isnull()), :]
    nf = nf.merge(pf.loc[:,["name","name_orig"]].drop_duplicates(), how = "left", on = "name")
    nf = nf.loc[:, ["name_orig","name","count","name_tagged"]]
    nf = nf.reset_index(drop=True)

    # Do some additional cleaning of tagged names to remove suffixes, titles, honorifics, etc.
    # NOTE(review): the [IV]{2,} alternative has no leading space, so it also
    # matches at the end of a longer all-caps word -- confirm that is intended.
    nf["name_tagged"] = nf["name_tagged"].str.replace(r"( [SJ][Rr]\.?|[IV]{2,})( |$)"," ", regex = True).str.strip()
    titles = r"( |^)(Esq\.?|Sgt\.?|Sergeant|Captain|Capt\.?|Warden|Dr\.|Lt\.?|Lieutenant|Honorable|Hon\.?|Judge|Correctional Facility)( |$)"
    nf["name_tagged"] = nf["name_tagged"].str.replace(titles, " ", regex = True).str.strip()
    nf["name_tagged"] = nf["name_tagged"].str.replace(titles.upper(), " ", regex = True).str.strip()
    # In mixed-case names, drop a trailing all-caps word
    nf["name_tagged"] = nf["name_tagged"].apply(lambda x: re.sub(" [A-Z]{2,}$","", x).strip() if re.search("[a-z]",x) else x.strip())
    nf["name_tagged"] = nf["name_tagged"].str.replace(r" \- ","-",regex=True)
    nf["name_tagged"] = nf["name_tagged"].str.replace(r"[\(\)]"," ",regex=True).str.replace(" +"," ",regex=True)

    # Use a custom dictionary to identify strings we do not think are human names (pass 2)
    print(" > Removing plaintiff names with apparently non-human tokens (pass 2)...")
    rmvd = len(nf)
    with Pool(processes=ncores) as pool:
        test = pool.map(CullFunction, [(x, nf["name_tagged"]) for x in toks])
    test = pd.concat(test, axis=1)
    nf = nf.loc[(test.eq(True).all(axis=1)), :]
    rmvd = rmvd - len(nf)
    print("   >> Removed " + str(rmvd) + " plaintiff names...")
    del test
    nf = nf.reset_index(drop=True)

    # Split the first, middle and last names
    with Pool(processes=ncores) as pool:
        tf = pool.map(SplitFirstLast, [(index,row["name_tagged"]) for index, row in nf.iterrows()])
    tf = pd.concat(tf, axis=0)
    # Restore the row order of nf before attaching the columns side by side
    tf = tf.sort_values("index").drop("index",axis=1).reset_index(drop=True)
    nf = pd.concat([nf,tf], axis = 1)

    nf["human"] = 1

    # Final merge and save
    pf1 = pd.read_csv(data_path, dtype = "object")
    pf1 = pf1.merge(nf.loc[:,["name_orig","human","fn","mn","ln"]], how = "left", left_on = "name", right_on = "name_orig")
    pf1 = pf1.loc[:,["file","CASE_ID","pid","party_type","COUNTY_CODE","human","fn","mn","ln"]].drop_duplicates().reset_index(drop=True)

    # Plaintiffs without a matched human name get human = 0
    pf1.loc[pf1["human"].isnull(),"human"] = 0
    pf1["human"] = pf1["human"].astype(int)

    fname = odir + "/chp_apsr_plaintiffs.csv"
    pf1.to_csv(fname, index=False)